---
title: "Processing"
author: "Matthew B. Platt"
date: "October 23, 2020"
output: html_document
---

## Set up

- Define chunk behavior
- Load libraries

```{r setup, include=FALSE}
# Default chunk option for the chunks in this document: echo = TRUE shows each
# chunk's source code alongside its output in the knitted HTML.
knitr::opts_chunk$set(echo = TRUE)

# Load libraries
library(tidyverse)   # supplies mutate(), group_by(), summarise() and %>% used in later chunks
library(poliscidata) # NOTE(review): nothing in the visible chunks calls this package - confirm it is still needed
library(readxl)      # supplies read_excel(), used to read the thesis workbook


```

- Load data

```{r load-data}

# Read in the original data files. Relative paths are used so the files are
# picked up from the "Data/OriginalData" folder.

# Thesis workbook: read the sheet named "stuff"; the first row of the sheet
# supplies the column names.
thesisdata <- read_excel(
  "../../Data/OriginalData/thesisdata.xlsx",
  sheet = "stuff",
  col_names = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)

# Second input file, read with base R's read.csv().
electivestack <- read.csv("../../Data/OriginalData/electivestack.csv")


```

## Data Wrangling

### Create new variables

New variables include:

- `score`
- `sourcecount`
- `prereq`
- `allelect`

```{r new-variables}

# Add four derived columns to thesisdata:
#   sourcecount - 1 when `sources` is 7 or more, otherwise 0
#   score       - sum of the seven columns topicmatch, concepts, litcritique,
#                 hypothesis, primarydata, dataviz and blackpol
#   prereq      - 1 when `scope` equals 1 and `electives` equals 4, otherwise 0
#   allelect    - 1 when `electives` equals 4, otherwise 0
# No NA handling is applied here, so `score` is NA whenever any of its seven
# components is NA.
thesisdata <- thesisdata %>%
  mutate(
    sourcecount = as.numeric(sources >= 7),
    score = topicmatch + concepts + litcritique + hypothesis +
      primarydata + dataviz + blackpol,
    prereq = as.numeric(scope == 1 & electives == 4),
    allelect = as.numeric(electives == 4)
  )

```

### Create Summary Data

The paper makes use of the following summary datasets:

- `sumthesis1`

```{r summary}

# Build sumthesis1: one row per combination of term, ayear and semfield.
# Every aggregate drops missing values (na.rm = TRUE) before summing or
# averaging. `TRUE` is spelled out throughout because `T` is an ordinary
# variable that can be reassigned.
#
# NOTE(review): summarise() is called without `.groups`, so under dplyr's
# default the result stays grouped by term and ayear. That grouping is saved
# with the object; add ungroup() here if downstream code expects a plain table.
sumthesis1 <- thesisdata %>%
  group_by(term, ayear, semfield) %>%
  summarise(
    # number of rows in the group
    count = n(),
    # totals of the submitted / graded / incomplete / prereq columns
    submits = sum(submitted, na.rm = TRUE),
    grades = sum(graded, na.rm = TRUE),
    incompletes = sum(incomplete, na.rm = TRUE),
    prerequisites = sum(prereq, na.rm = TRUE),
    # group means and the scope total
    avgscore = mean(score, na.rm = TRUE),
    avgelective = mean(electives, na.rm = TRUE),
    scopecount = sum(scope, na.rm = TRUE),
    avgsource = mean(sources, na.rm = TRUE),
    avgpage = mean(pages, na.rm = TRUE),
    # totals of each of the seven components that make up `score`
    matched = sum(topicmatch, na.rm = TRUE),
    concept = sum(concepts, na.rm = TRUE),
    evals = sum(litcritique, na.rm = TRUE),
    hypotheses = sum(hypothesis, na.rm = TRUE),
    primary = sum(primarydata, na.rm = TRUE),
    visualization = sum(dataviz, na.rm = TRUE),
    blackpols = sum(blackpol, na.rm = TRUE),
    # mean of the gpa column
    cumgpa = mean(gpa, na.rm = TRUE)
  )

```




## Save analysis dataset

```{r save-data}

# Write the three analysis objects (thesisdata, electivestack, sumthesis1)
# to a single file called "analysis.RData". The relative path places the file
# in the "Data/AnalysisData" folder.
analysis_objects <- c("thesisdata", "electivestack", "sumthesis1")

save(
  list = analysis_objects,
  file = "../../Data/AnalysisData/analysis.RData"
)

```



